import scrapy
from scrapy.linkextractors import LinkExtractor
from ..items import HeartItem
import pymongo
from scrapy.conf import settings



class AllSpider(scrapy.Spider):
    """Crawl dxy.cn: follow each main category in the home-page nav bar,
    then each category's hot tags, then every listed article newer than
    the crawl date recorded in MongoDB.
    """
    name = "all_spider"
    start_urls = ["http://www.dxy.cn/"]

    def __init__(self, *args, **kwargs):
        # Forward args/kwargs so Scrapy's `-a key=value` spider arguments
        # keep working (the original swallowed them and skipped super()).
        super().__init__(*args, **kwargs)
        # NOTE(review): `from scrapy.conf import settings` is deprecated and
        # removed in modern Scrapy — migrate to crawler settings via
        # `from_crawler` when the project upgrades.
        self.client = pymongo.MongoClient(
            host=settings['MONGO_HOST'],
            port=settings['MONGO_PORT'])
        self.db = self.client[settings['MONGO_DB']]
        # Collection that stores the date of the previous crawl.
        self.lastTime = self.db[settings['MONGO_TIME_COLL']]

    def parse(self, response):
        """Request every main-category link in the home-page navigation bar
        and hand each response to `parse_main_tag`.
        """
        le = LinkExtractor(restrict_xpaths="//div[@class='navlst2']")
        for link in le.extract_links(response):
            yield scrapy.Request(link.url, callback=self.parse_main_tag)

    def parse_main_tag(self, response):
        """Request each link in the "hot tags" sidebar (the sub-categories),
        carrying the tag text to `parse_articles` via request meta.
        """
        hot_tag = LinkExtractor(restrict_xpaths="//div[@class='el_sidenav1 x_nav1']")
        for link in hot_tag.extract_links(response):
            # link.text is the visible tag name, used later as the sub-tag.
            meta = {"tag2": link.text}
            yield scrapy.Request(link.url, callback=self.parse_articles, meta=meta)

    def parse_articles(self, response):
        """Scrape article links on the current listing page that are newer
        than the last recorded crawl date, then follow the next-page link.

        Stops entirely (no further pagination) as soon as an older article
        is seen — the listing is assumed to be ordered newest-first.
        """
        meta = response.meta
        article_links = response.xpath("//p[contains(@class,'title')]/a[contains(@class,'h4')]/@href").extract()
        article_times = response.xpath("//p[contains(@class,'title')]/span[contains(@class,'fr')]/text()").extract()
        # `.count({})` was removed in PyMongo 4; count_documents is the
        # supported equivalent.
        last_times = self.lastTime.count_documents({})
        res = self.lastTime.find_one({'times': last_times})
        # First run: fall back to a date string older than anything real.
        # Dates compare lexicographically ('YYYY.MM.DD' format assumed —
        # TODO confirm against the stored documents).
        last_time = '1900.01.01' if res is None else res["date"]
        # zip() pairs each link with its date and tolerates unequal-length
        # node lists (the original parallel-index loop could IndexError).
        for article_link, article_time in zip(article_links, article_times):
            if article_time < last_time:
                # Older than the last crawl; everything after is older too.
                return
            yield scrapy.Request(article_link, callback=self.parse_single_article, meta=meta)
        # Follow the "next page" link, if present.
        links = response.xpath("//div[contains(@class,'el_page x_page1')]//a[contains(@title,'下一页')]/@href").extract()
        if links:
            yield scrapy.Request(links[0], callback=self.parse_articles, meta=meta)

    def parse_single_article(self, response):
        """Final parser: build a HeartItem from one article page.

        Extraction failures are logged and whatever fields were filled so
        far are still yielded — same best-effort outcome as the original
        yield-in-finally, but without the risk of `article_items` being
        unbound when construction itself fails.
        """
        article_items = HeartItem()
        try:
            tag1 = response.xpath("string(//a[contains(@class,'channel_name')])").extract()[0]
            tag2 = response.meta["tag2"]
            article_items["tag1"] = tag1
            # Main category mapped to a list holding the sub-tag.
            article_items["tag"] = {"%s" % tag1: [tag2]}
            article_items["url"] = response.url
            print(response.url)
            article_items["article_id"] = response.url.split('/')[-1]
            article_items["title"] = response.xpath("//h1/text()").extract()[0]
            article_items["date"] = response.xpath("//div[contains(@class,'sum')]/span/text()").extract()[0].strip()
            article_items["source"] = response.xpath("string(//div[contains(@class,'sum')]/span[2])").extract()[0]
            article_items["author"] = response.xpath("//div[contains(@class,'sum')]/span/text()").extract()[2].strip()
            article_items["content"] = response.xpath("//div[@id='content']/p/descendant::text()").extract()
        except (IndexError, KeyError):
            # Page layout did not match the expected XPaths.
            self.logger.exception("Failed to fully parse article %s", response.url)
        yield article_items
